##############################################################################
############################## Section 2 Data & Appendix 
############################## Subsection 2.2 Summary statistics
############################## Summary statistics
##############################################################################
# the tables are by size and exclude PD and collateral
# the exported tex files includes these two variables, but they are manually removed
# the tex files needs further manual adjustment
##############################################################################

## summary stats on debtor level ##
# OVERALL
# export in latex

## import libraries
import pandas as pd
import numpy as np
import os

# display all rows and columns
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

# data paths
path_data = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets'
path_orbis = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final datasets\Orbis'
path_charts = r'P:\ECB business areas\DGR\Staff\papouts\Research\Active\Firm Heterogeneities\Final charts\Summary statistics'
if not os.path.isdir(path_charts):
    os.makedirs(path_charts)
 

# define describe functions for the float variables
def describe_float(data):
    # for floating numbers change only the format of the count
    tbl = data.describe()
    tbl.loc['count'] = tbl.loc['count'].apply(lambda x: '{:,.0f}'.format(x))
    tbl.iloc[1:,:] = tbl.iloc[1:,:].apply(lambda x: x.map('{:,.2f}'.format))     
    return tbl

   
# load data
# we need the summary stats only for EA11
ea11 = ['AT','BE','DE','ES','FI','FR','GR','IE','IT','NL','PT']
dbtr_lvl = pd.read_stata(path_data + r'\dbtr_lvl_dt_f.dta')
dbtr_lvl.drop(dbtr_lvl[dbtr_lvl.nace_code == ''].index, inplace=True)
dbtr_lvl.drop(dbtr_lvl[dbtr_lvl.cntry_dbtr.isin(ea11) == False].index, inplace=True)
# max debtors -> 13
dbtr_lvl.loc[dbtr_lvl['nmbr_crdtrs'] > 13, 'nmbr_crdtrs'] = 13

# format the columns
# data for the amounts should be in thousands
dbtr_lvl['ona_dbtr'] = dbtr_lvl['ona_dbtr'].apply(lambda x: round(x/1000,0))
dbtr_lvl['nmbr_crdtrs'] = dbtr_lvl['nmbr_crdtrs'].apply(lambda x: round(x,0))
dbtr_lvl['mb_shr'] = dbtr_lvl['mb_shr'].apply(lambda x: round(x,2))
dbtr_lvl['hhi'] = dbtr_lvl['hhi'].apply(lambda x: round(x,2))
t1 = dbtr_lvl[['ona_dbtr', 'nmbr_crdtrs']].describe().apply(lambda x: x.map('{:,.0f}'.format))
t2 = describe_float(dbtr_lvl[['mb_shr','hhi']])
dbtr_lvl_tbl = pd.concat([t1,t2],axis=1)
# for nempl, balance sheet and turnover we need the data that is used for the size variables
AC_final = pd.read_stata(path_data + r'\dbtr_combined_dt.dta')
AC_final.drop(AC_final[AC_final.cntry.isin(ea11) == False].index, inplace=True)

# rule for the final size
# first corrected size - AC data
# second - Orbis data
# AC size - means no data available

# merge with debtor level data
dd_merged_f = dbtr_lvl[['cntry_dbtr','dbtr_id']].merge(AC_final, how='left', right_on=['cntry', 'entty_riad_id'], left_on=['cntry_dbtr','dbtr_id'])
# check columns and missing values
dd_merged_f.drop(['cntry','entty_riad_id'],axis=1,inplace=True)
# for all final sizes using AC data only, take it
dd_merged_f['nempl_f'] = dd_merged_f['nmbr_emplys_le_c']
dd_merged_f['blnc_sht_f'] = dd_merged_f['blnc_sht_ttl_le_c']
dd_merged_f['annl_trn_f'] = dd_merged_f['annl_trnvr_le_c']
# fill in missing values with orbis data where we do not have any values for the AC size
dd_merged_f['nempl_f'][(dd_merged_f['entrprs_sz_le_corr'] == -99)] = dd_merged_f['numberofemployees']
dd_merged_f['blnc_sht_f'][(dd_merged_f['entrprs_sz_le_corr'] == -99)] = dd_merged_f['totalassets']
dd_merged_f['annl_trn_f'][(dd_merged_f['entrprs_sz_le_corr'] == -99)] = dd_merged_f['operatingrevenueturnover']
print(dd_merged_f.isna().sum())
# data for the amounts should be in thousands
dd_merged_f['nempl_f'] = dd_merged_f['nempl_f'].apply(lambda x: round(x,0))
dd_merged_f['blnc_sht_f'] = dd_merged_f['blnc_sht_f'].apply(lambda x: round(x/1000,0))
dd_merged_f['annl_trn_f'] = dd_merged_f['annl_trn_f'].apply(lambda x: round(x/1000,0))
# describe the data
orbis_tbl = dd_merged_f[['nempl_f', 'blnc_sht_f', 'annl_trn_f']].describe().applymap('{:,.0f}'.format)

# instrument level to debtor level
# exclude cases with missing outst_amnt_byinstr to match the debtor dataset 
# this does not affect the calculation of values
# load data and apply filters
instr_lvl = pd.read_stata(path_data + r'\instr_lvl_dt_f.dta')
instr_lvl.drop(instr_lvl[instr_lvl.nace_code == ''].index, inplace=True)
instr_lvl.drop(instr_lvl[instr_lvl.cntry_dbtr.isin(ea11) == False].index, inplace=True)
instr_lvl.drop(instr_lvl[instr_lvl.outst_amnt_byinstr.isna() == True].index, inplace=True)

# debtor level calculations
# interest rate
# remove missing interest rates
intrst_dt = instr_lvl.drop(instr_lvl[instr_lvl.wir_byloantype.isna() == True].index)
intrst_dt['irxona'] = intrst_dt['ona_intrate_c_ttl']*intrst_dt['wir_byloantype']
wir_dt = intrst_dt.groupby(['cntry_dbtr', 'dbtr_id','sz_f'])['ona_intrate_c_ttl', 'irxona'].sum().reset_index()
wir_dt['wir'] = wir_dt['irxona']/wir_dt['ona_intrate_c_ttl']
wir_dt['wir'] = wir_dt['wir'].apply(lambda x: round(x*100,2))
del intrst_dt
# maturity - ONLY LOANS AND CREDIT LINES
# remove missing maturity
mtrty_dt = instr_lvl.drop(instr_lvl[instr_lvl.mtrty_byloantype.isna() == True].index)
instr_lvl_slctd = mtrty_dt[mtrty_dt.typ_instr_corr.isin(['Loans','Credit lines'])].copy()
instr_lvl_slctd['mtrxona'] = instr_lvl_slctd['ona_orgnlmtrty_c_ttl']*instr_lvl_slctd['mtrty_byloantype']
mtr_dt = instr_lvl_slctd.groupby(['cntry_dbtr', 'dbtr_id','sz_f'])['ona_orgnlmtrty_c_ttl', 'mtrxona'].sum().reset_index()
mtr_dt['wmtr'] = mtr_dt['mtrxona']/mtr_dt['ona_orgnlmtrty_c_ttl']
mtr_dt['wmtr'] = mtr_dt['wmtr'].apply(lambda x: round(x,2))
del mtrty_dt, instr_lvl_slctd
# PD
# remove missing PD
pd_cl = instr_lvl.drop(instr_lvl[instr_lvl.pddbtr_byloantype.isna() == True].index)
pd_cl['pdxona'] = pd_cl['ona_pddbtr_c_ttl']*pd_cl['pddbtr_byloantype']
pd_dt = pd_cl.groupby(['cntry_dbtr', 'dbtr_id','sz_f'])['ona_pddbtr_c_ttl', 'pdxona'].sum().reset_index()
pd_dt['wpd'] = pd_dt['pdxona']/pd_dt['ona_pddbtr_c_ttl']
# put 1s to null
pd_dt['wpd_cl']  = pd_dt['wpd'] 
pd_dt['wpd_cl'][(pd_dt['wpd'] == 1)] = np.nan
pd_dt['wpd_cl'] = pd_dt['wpd_cl'].apply(lambda x: round(x*100,2))
del pd_cl
# collateral ratio
coll_dt = instr_lvl.groupby(['cntry_dbtr', 'dbtr_id','sz_f'])['outst_amnt_byinstr', 'prtctn_amnt_byinstr'].sum().reset_index()
coll_dt['coll_ratio'] = coll_dt['prtctn_amnt_byinstr']/coll_dt['outst_amnt_byinstr']
# cap all observations above 1 to 1
coll_dt['coll_ratio_cl'] = coll_dt['coll_ratio']
coll_dt['coll_ratio_cl'][(coll_dt['coll_ratio'] > 1)] = 1
coll_dt['coll_ratio_cl'][(coll_dt['outst_amnt_byinstr'] == 0)] = 1
coll_dt['coll_ratio'] = coll_dt['coll_ratio'].apply(lambda x: round(x,2))

# summary stats - combine all tables
instr_lvl_tbl = describe_float(pd.DataFrame(wir_dt['wir']))
instr_lvl_tbl = pd.concat([instr_lvl_tbl, describe_float(pd.DataFrame(mtr_dt['wmtr']))], axis=1)
instr_lvl_tbl = pd.concat([instr_lvl_tbl, describe_float(pd.DataFrame(pd_dt['wpd_cl']))], axis=1)
instr_lvl_tbl = pd.concat([instr_lvl_tbl, describe_float(pd.DataFrame(coll_dt['coll_ratio_cl']))], axis=1)
# concat the final tables
final_summary = pd.concat([dbtr_lvl_tbl,instr_lvl_tbl,orbis_tbl],axis=1)

# format table
ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
col = {'ona_dbtr':'Outstanding amount (in thousands)','nmbr_crdtrs':'Number of banks','mb_shr':'Share from main bank','hhi':'Herfindahl index','wir':'Interest rate (\%)','wmtr':'Maturity (in years)','nempl_f':'Number of employees','blnc_sht_f':'Balance sheet total (in thousands)','annl_trn_f':'Annual turnover (in thousands)','wpd_cl':'Probability of default (\%)','coll_ratio_cl':'Collateral ratio'}
final_summary = final_summary.loc[['count','25%','50%','75%','mean']]
final_summary.index = final_summary.index.map(ind)
final_summary.columns = final_summary.columns.map(col)

# check result
print(final_summary.to_latex(column_format='lccccc',
                           caption='Summary statistics'
                           ))
# save in a tex file
# we have a note in the text which is not included here
final_summary.to_latex(path_charts+r'\summary_stats.tex', column_format='lccccc',
                             caption='Summary statistics')

########################################################################
## TABLE BY SIZE 

# summary stats - combine all tables
# debtor level
dbtr_lvl_tbl = dbtr_lvl.groupby(['sz_f'])[['ona_dbtr', 'nmbr_crdtrs', 'mb_shr','hhi']].describe()
orbis_tbl = dd_merged_f.groupby(['sz_f'])[['nempl_f', 'blnc_sht_f', 'annl_trn_f']].describe()
# instrument level
instr_merged = wir_dt.merge(mtr_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f']).merge(pd_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f']).merge(coll_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f'])
instr_lvl_tbl = instr_merged.groupby(['sz_f'])[['wir', 'wmtr', 'wpd_cl','coll_ratio_cl']].describe()

# concat the final tables
final_summary = pd.concat([dbtr_lvl_tbl,instr_lvl_tbl,orbis_tbl],axis=1)

# format table by size
var_order = ['Outstanding amount (in thousands)','Number of banks','Share from main bank','Herfindahl index','Interest rate (\%)','Maturity (in years)','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)','Probability of default (\%)','Collateral ratio']
tbl_bysz = {}
for i in final_summary.index:
    print(i)
    tbl = final_summary.loc[i].T.reset_index()
    tbl.columns = ['var','stat','size']
    tbl_r = tbl.pivot(index='stat',values='size',columns='var')
    ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
    col = {'ona_dbtr':'Outstanding amount (in thousands)','nmbr_crdtrs':'Number of banks','mb_shr':'Share from main bank','hhi':'Herfindahl index','wir':'Interest rate (\%)','wmtr':'Maturity (in years)','nempl_f':'Number of employees','blnc_sht_f':'Balance sheet total (in thousands)','annl_trn_f':'Annual turnover (in thousands)','wpd_cl':'Probability of default (\%)','coll_ratio_cl':'Collateral ratio'}
    s = tbl_r.loc[['count','25%','50%','75%','mean']]
    s.index = s.index.map(ind)
    s.columns = s.columns.map(col)
    # we have to apply the styles independently
    s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']] = s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']].apply(lambda x: round(x,0))
    s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']] = s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']].apply(lambda x: x.map('{:,.0f}'.format))
    s.loc['No.',~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])] = s.loc['No.',~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])].apply(lambda x: '{:,.0f}'.format(x))
    s.loc[~s.index.isin(['No.']),~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])] = s.loc[~s.index.isin(['No.']),~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])].apply(lambda x: x.map('{:,.2f}'.format))
        
    tbl_bysz[i] = s[var_order].T

for i in tbl_bysz.keys():
    print(i)
    tbl_bysz[i]['Firm size'] = i

# dict to table    
tbl_bysz_dt = pd.concat(tbl_bysz.values(), ignore_index=False) 
tbl_bysz_dt.reset_index(inplace=True)
tbl_bysz_dt = tbl_bysz_dt[['Firm size','var','No.', 'p25', 'p50', 'p75', 'mean']]
# copy this into Overleaf table because we need additional formatting
# save table 
# the order is from large to micro
with open(path_charts+r"\summary_stats_bysz.tex", "w") as f:
    f.write(tbl_bysz_dt.to_latex(index=False,column_format='llccccc',
                             caption='Summary statistics by firm size - large, medium, small, micro'))


########################################################################
## TABLE BY SIZE AND COUNTRY

# summary stats - combine all tables
# debtor level
dbtr_lvl_tbl = dbtr_lvl.groupby(['cntry_dbtr','sz_f'])[['ona_dbtr', 'nmbr_crdtrs', 'mb_shr','hhi']].describe()
orbis_tbl = dd_merged_f.groupby(['cntry_dbtr','sz_f'])[['nempl_f', 'blnc_sht_f', 'annl_trn_f']].describe()
instr_merged = wir_dt.merge(mtr_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f']).merge(pd_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f']).merge(coll_dt,how='left',on=['cntry_dbtr', 'dbtr_id', 'sz_f'])
instr_lvl_tbl = instr_merged.groupby(['cntry_dbtr','sz_f'])[['wir', 'wmtr', 'wpd_cl','coll_ratio_cl']].describe()

# concat the final tables
final_summary = pd.concat([dbtr_lvl_tbl,instr_lvl_tbl,orbis_tbl],axis=1)

# format table by size
var_order = ['Outstanding amount (in thousands)','Number of banks','Share from main bank','Herfindahl index','Interest rate (\%)','Maturity (in years)','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)','Probability of default (\%)','Collateral ratio']
tbl_bycntry = {}
for i in final_summary.index:
    print(i)
    tbl = final_summary.loc[i].T.reset_index()
    tbl.columns = ['var','stat','size']
    tbl_r = tbl.pivot(index='stat',values='size',columns='var')
    ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
    col = {'ona_dbtr':'Outstanding amount (in thousands)','nmbr_crdtrs':'Number of banks','mb_shr':'Share from main bank','hhi':'Herfindahl index','wir':'Interest rate (\%)','wmtr':'Maturity (in years)','nempl_f':'Number of employees','blnc_sht_f':'Balance sheet total (in thousands)','annl_trn_f':'Annual turnover (in thousands)','wpd_cl':'Probability of default (\%)','coll_ratio_cl':'Collateral ratio'}
    s = tbl_r.loc[['count','25%','50%','75%','mean']]
    s.index = s.index.map(ind)
    s.columns = s.columns.map(col)
    # we have to apply the styles independently
    s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']] = s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']].apply(lambda x: round(x,0))
    s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']] = s.loc[:,['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)']].apply(lambda x: x.map('{:,.0f}'.format))
    s.loc['No.',~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])] = s.loc['No.',~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])].apply(lambda x: '{:,.0f}'.format(x))
    s.loc[~s.index.isin(['No.']),~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])] = s.loc[~s.index.isin(['No.']),~s.columns.isin(['Outstanding amount (in thousands)','Number of banks','Number of employees','Balance sheet total (in thousands)','Annual turnover (in thousands)'])].apply(lambda x: x.map('{:,.2f}'.format))
        
    tbl_bycntry[i] = s[var_order].T

for i in tbl_bycntry.keys():
    print(i)
    tbl_bycntry[i]['Country'] = i[0]    
    tbl_bycntry[i]['Firm size'] = i[1]

# save table 
tbl_bycntry_dt = pd.concat(tbl_bycntry.values(), ignore_index=False)
tbl_bycntry_dt.reset_index(inplace=True)
tbl_bycntry_dt = tbl_bycntry_dt[['Country','Firm size', 'var','No.', 'p25', 'p50', 'p75', 'mean']]
with open(path_charts+r"\summary_stats_bysz_bycntry.tex", "w") as f:
    f.write(tbl_bycntry_dt.to_latex(index=False, column_format='lllccccc',
                             caption='Summary statistics by firm size and country - large, medium, small, micro'))


########################################################################
## TABLE BY INSTRUMENT TYPES
# we need a summary of the shares by loan types on debtor level
# get debtor level ONA
instr_lvl_ona = instr_lvl.groupby(['cntry_dbtr', 'dbtr_id'])['outst_amnt_byinstr'].sum().reset_index().rename(columns={'outst_amnt_byinstr':'ona_dbtr'})
# calculate the shares
instr_lvl = instr_lvl[['cntry_dbtr', 'dbtr_id', 'nace_code', 'sz_f','typ_instr_corr','outst_amnt_byinstr']].merge(instr_lvl_ona,how='left',on=['cntry_dbtr', 'dbtr_id'])
instr_lvl.columns
instr_lvl.ona_dbtr.describe()
instr_lvl['loan_shr'] = instr_lvl['outst_amnt_byinstr']/instr_lvl['ona_dbtr']
instr_lvl['loan_shr'] = instr_lvl['loan_shr'].apply(lambda x: round(x,2))
# OVERALL
# apply formatting
final_summary = instr_lvl.groupby(['typ_instr_corr'])['loan_shr'].describe().T
final_summary.loc['count'] = final_summary.loc['count'].apply(lambda x: '{:,.0f}'.format(x))
final_summary.iloc[1:,:] = final_summary.iloc[1:,:].apply(lambda x: x.map('{:,.2f}'.format)) 
# order index and columns
final_summary = final_summary.loc[['count','25%','50%','75%','mean']]
final_summary = final_summary[['Loans','Credit lines','Finance leases','Trade receivables','Revolving credit']]
# rename index
ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
final_summary.index = final_summary.index.map(ind)

# check result
print(final_summary.T.to_latex(column_format='lccccc',
                           caption='Summary statistics'
                           ))
# save in a tex file
final_summary.T.to_latex(path_charts+r'\summary_stats_loanshr.tex', column_format='lccccc',
                             caption='Summary statistics')
# BY SIZE
final_summary = instr_lvl.groupby(['sz_f','typ_instr_corr'])['loan_shr'].describe().T
final_summary.loc['count'] = final_summary.loc['count'].apply(lambda x: '{:,.0f}'.format(x))
final_summary.iloc[1:,:] = final_summary.iloc[1:,:].apply(lambda x: x.map('{:,.2f}'.format)) 
# order index and columns
final_summary = final_summary.loc[['count','25%','50%','75%','mean']]
final_summary = final_summary.reindex(columns=['Loans','Credit lines','Finance leases','Trade receivables','Revolving credit'], level=1)
# rename index
ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
final_summary.index = final_summary.index.map(ind)

# save in a tex file
final_summary.T.to_latex(path_charts+r'\summary_stats_loanshr_bysz.tex', column_format='llccccc',
                             caption='Summary statistics by firm size - large, medium, small, micro')

# BY SIZE AND COUNTRY
final_summary = instr_lvl.groupby(['cntry_dbtr','sz_f','typ_instr_corr'])['loan_shr'].describe().T
final_summary.loc['count'] = final_summary.loc['count'].apply(lambda x: '{:,.0f}'.format(x))
final_summary.iloc[1:,:] = final_summary.iloc[1:,:].apply(lambda x: x.map('{:,.2f}'.format)) 
# order index and columns
final_summary = final_summary.loc[['count','25%','50%','75%','mean']]
final_summary = final_summary.reindex(columns=['Loans','Credit lines','Finance leases','Trade receivables','Revolving credit'], level=2)
# rename index
ind = {'count':'No.','25%':'p25','50%':'p50','75%':'p75','mean':'mean'}
final_summary.index = final_summary.index.map(ind)
 
# save in a tex file
final_summary.T.to_latex(path_charts+'\summary_stats_loanshr_bysz_bycntry.tex', column_format='lllccccc',
                             caption='Summary statistics by firm size and country - large, medium, small, micro')




